PyTorch walkthrough generated by modifying & combining several tutorials:

This notebook will walk through:

  • Building and training a convolutional neural network (CNN) on the MNIST dataset
  • Visualization of the resulting learned filters

Dependencies (tested on):

  • torch: 0.3.0
  • torchvision
  • matplotlib

Part 1: Simple CNN for MNIST digit classification

In [73]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
%matplotlib inline
 
# If you have a GPU set this to true!
USE_CUDA = False
 
In [74]:
torch.manual_seed(1)    # reproducible
 
Out[74]:
<torch._C.Generator at 0x7f38bc057450>
In [75]:
# Hyper Parameters
EPOCH = 1               # train the training data n times, to save time, we just train 1 epoch
BATCH_SIZE = 50
LR = 0.001              # learning rate
DOWNLOAD_MNIST = True   # set to False if you have downloaded
 
In [76]:
# Mnist digits dataset
train_data = torchvision.datasets.MNIST(
    root='./mnist/',
    train=True,                                     # this is training data
    transform=torchvision.transforms.ToTensor(),    # Converts a PIL.Image or numpy.ndarray to
                                                    # torch.FloatTensor of shape (C x H x W) and normalize in the range [0.0, 1.0]
    download=DOWNLOAD_MNIST,                        # download it if you don't have it
)
 
In [77]:
# plot one example from the training set
# NOTE(review): `.train_data` / `.train_labels` are the legacy torchvision
# attribute names (matching the torch 0.3-era dependency listed above); newer
# torchvision renames them to `.data` / `.targets` — confirm before upgrading.
print(train_data.train_data.size())                 # (60000, 28, 28)
print(train_data.train_labels.size())               # (60000)
i=5                                                 # index of the sample to display
plt.imshow(train_data.train_data[i].numpy(), cmap='gray')
plt.title('%i' % train_data.train_labels[i])        # digit label as the plot title
plt.show()
 
torch.Size([60000, 28, 28])
torch.Size([60000])
In [78]:
# Data Loader for easy mini-batch return in training, the image batch shape will be (50, 1, 28, 28)
train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
 
In [79]:
# convert test data into Variable, pick 2000 samples to speed up testing
test_data = torchvision.datasets.MNIST(root='./mnist/', train=False)
test_x = Variable(torch.unsqueeze(test_data.test_data, dim=1)).type(torch.FloatTensor)[:2000]/255.   # shape from (2000, 28, 28) to (2000, 1, 28, 28), value in range(0,1)
test_y = test_data.test_labels[:2000]
 
In [80]:
class CNN(nn.Module):
    """Small MNIST classifier: two conv blocks (conv -> ReLU -> 2x2 max-pool)
    followed by one fully connected layer over the flattened 32x7x7 features.

    ``forward`` returns ``(logits, flattened_features)``; the second value is
    kept so the last layer can be visualized with t-SNE later on.
    """

    def __init__(self):
        super(CNN, self).__init__()
        # (1, 28, 28) -> conv 5x5, pad 2 -> (16, 28, 28) -> pool -> (16, 14, 14)
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,      # grayscale input
                out_channels=16,    # number of learned filters
                kernel_size=5,
                stride=1,
                padding=2,          # (kernel_size - 1) / 2 keeps H and W unchanged
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        # (16, 14, 14) -> conv 5x5, pad 2 -> (32, 14, 14) -> pool -> (32, 7, 7)
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        # one output unit per digit class
        self.out = nn.Linear(32 * 7 * 7, 10)

    def forward(self, x):
        features = self.conv2(self.conv1(x))
        # flatten everything but the batch dimension: (batch, 32 * 7 * 7)
        flat = features.view(features.size(0), -1)
        logits = self.out(flat)
        return logits, flat    # flat is returned for visualization
 
In [81]:
cnn = CNN()
print(cnn)  # net architecture
 
CNN(
  (conv1): Sequential(
    (0): Conv2d (1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
  )
  (conv2): Sequential(
    (0): Conv2d (16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
  )
  (out): Linear(in_features=1568, out_features=10)
)
In [82]:
optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)   # optimize all cnn parameters
loss_func = nn.CrossEntropyLoss()                       # the target label is not one-hotted
 
In [83]:
# following function (plot_with_labels) is for visualization, can be ignored if not interested
from matplotlib import cm
try:
    from sklearn.manifold import TSNE
    HAS_SK = True
except ImportError:
    # Only the missing-package case is expected here; the original bare
    # `except:` would also have hidden unrelated errors (even KeyboardInterrupt).
    HAS_SK = False
    print('Please install sklearn for layer visualization')
def plot_with_labels(lowDWeights, labels):
    """Scatter a 2D embedding, drawing each point as its digit label.

    Each label 0-9 is mapped onto the rainbow colormap and used as the text
    background color; axes are fit to the embedding's extent.
    """
    plt.cla()
    xs = lowDWeights[:, 0]
    ys = lowDWeights[:, 1]
    for px, py, digit in zip(xs, ys, labels):
        background = cm.rainbow(int(255 * digit / 9))
        plt.text(px, py, digit, backgroundcolor=background, fontsize=9)
    plt.xlim(xs.min(), xs.max())
    plt.ylim(ys.min(), ys.max())
    plt.title('Visualize last layer')
    plt.show()
    plt.pause(0.01)
 
plt.ion()
# training and testing: one pass over the MNIST training set, reporting test
# accuracy (on the 2000-sample test_x/test_y subset) every 100 mini-batches.
for epoch in range(EPOCH):
    for step, (x, y) in enumerate(train_loader):   # gives batch data, normalize x when iterate train_loader
        b_x = Variable(x)   # batch x, shape (50, 1, 28, 28)
        b_y = Variable(y)   # batch y, class indices
 
        output = cnn(b_x)[0]               # cnn output: logits only ([1] is the flattened features)
        loss = loss_func(output, b_y)   # cross entropy loss
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
 
        if step % 100 == 0:
            test_output, last_layer = cnn(test_x)
            # predicted class = argmax over the 10 logits per sample
            pred_y = torch.max(test_output, 1)[1].data.squeeze()
            # NOTE(review): this cell matches the torch 0.3 API declared at the
            # top of the notebook — on torch >= 0.4, `loss.data[0]` must become
            # `loss.item()` and `sum(pred_y == test_y)` returns a tensor; confirm
            # before upgrading.
            accuracy = sum(pred_y == test_y) / float(test_y.size(0))
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.data[0], '| test accuracy: %.2f' % accuracy)
            if HAS_SK:
                # Visualization of trained flatten layer (T-SNE), first 500 test samples
                tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
                plot_only = 500
                low_dim_embs = tsne.fit_transform(last_layer.data.numpy()[:plot_only, :])
                labels = test_y.numpy()[:plot_only]
                plot_with_labels(low_dim_embs, labels)
plt.ioff()
 
('Epoch: ', 0, '| train loss: 2.3101', '| test accuracy: 0.20')
('Epoch: ', 0, '| train loss: 0.6699', '| test accuracy: 0.88')
('Epoch: ', 0, '| train loss: 0.0701', '| test accuracy: 0.94')
('Epoch: ', 0, '| train loss: 0.1333', '| test accuracy: 0.94')
('Epoch: ', 0, '| train loss: 0.0401', '| test accuracy: 0.96')
('Epoch: ', 0, '| train loss: 0.0199', '| test accuracy: 0.96')
('Epoch: ', 0, '| train loss: 0.1071', '| test accuracy: 0.97')
('Epoch: ', 0, '| train loss: 0.0399', '| test accuracy: 0.97')
('Epoch: ', 0, '| train loss: 0.0794', '| test accuracy: 0.97')
('Epoch: ', 0, '| train loss: 0.1064', '| test accuracy: 0.97')
('Epoch: ', 0, '| train loss: 0.0295', '| test accuracy: 0.97')
('Epoch: ', 0, '| train loss: 0.0249', '| test accuracy: 0.98')
In [84]:
# print 10 predictions from test data
test_output, _ = cnn(test_x[:10])
pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
print(pred_y, 'prediction number')
print(test_y[:10].numpy(), 'real number')
 
(array([7, 2, 1, 0, 4, 1, 4, 9, 5, 9]), 'prediction number')
(array([7, 2, 1, 0, 4, 1, 4, 9, 5, 9]), 'real number')
In [85]:
# [todo]: Statistics/confusion matrix using sklearn
 

Part 2: CNN filter visualization using pre-trained CNN

In [86]:
import scipy.misc
from PIL import Image
import json
from torchvision import models
from torchvision import transforms, utils
import numpy as np
 
In [87]:
def to_grayscale(image):
    """Collapse a 3D (channels, height, width) tensor to a single
    (height, width) grayscale map by averaging over the channel dimension.

    Bug fixed: the original divided by ``image.shape[0]`` *after* summing over
    dim 0, at which point ``shape[0]`` is the image height, not the channel
    count — so the result was a sum scaled by an unrelated dimension. The
    channel count is now captured before the reduction.
    """
    channels = image.shape[0]
    return torch.sum(image, dim=0) / channels
 
 
In [88]:
def normalize(image):
    """Preprocess a PIL image for VGG: resize to 224x224, convert to a tensor
    and apply the standard ImageNet per-channel normalization.

    Returns a (1, 3, 224, 224) Variable, placed on the GPU when USE_CUDA is set.
    """
    # renamed from `normalize` so the transform no longer shadows this function
    imagenet_norm = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    pipeline = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        imagenet_norm,
    ])

    batch = pipeline(image).unsqueeze(0)
    if USE_CUDA:
        batch = batch.cuda()
    return Variable(batch)
 
 
def predict(image):
    # Forward the (1, 3, 224, 224) batch through the module-level `vgg` model
    # and take the argmax over the class scores of the first (only) sample,
    # then look the index up in the module-level ImageNet `labels` dict.
    # Returns (class-id string, human-readable label).
    # NOTE(review): `index[0]` relies on torch 0.3 semantics where `.max(0)`
    # returns 1-element tensors; on torch >= 0.4 the reduction yields 0-dim
    # tensors and this indexing fails — confirm before upgrading.
    _, index = vgg(image).data[0].max(0)
    return str(index[0]), labels[str(index[0])][1]
    
def deprocess(image):
    # Invert the ImageNet normalization applied by `normalize`: multiply by the
    # per-channel std and add back the per-channel mean. Expects channels in the
    # trailing dimension (h, w, c) so the 3-vectors broadcast over pixels.
    std = torch.Tensor([0.229, 0.224, 0.225])
    mean = torch.Tensor([0.485, 0.456, 0.406])
    if USE_CUDA:
        std = std.cuda()
        mean = mean.cuda()
    return image * std + mean
 
def load_image(path):
    """Open the image at `path`, display it inline, and return the PIL image."""
    img = Image.open(path)
    plt.imshow(img)
    plt.title("Image loaded successfully")
    return img
 
In [ ]:
kitten_1 = load_image("./images/Tongue-Kitten.jpg")
 
In [90]:
vgg = models.vgg16(pretrained=True)
 
In [91]:
if USE_CUDA:
    vgg = vgg.cuda() # if you want GPU
 
In [92]:
print(vgg)
 
VGG(
  (features): Sequential(
    (0): Conv2d (3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace)
    (2): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace)
    (4): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (5): Conv2d (64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace)
    (7): Conv2d (128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace)
    (9): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (10): Conv2d (128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): Conv2d (256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace)
    (14): Conv2d (256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace)
    (16): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (17): Conv2d (256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): ReLU(inplace)
    (19): Conv2d (512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (20): ReLU(inplace)
    (21): Conv2d (512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (22): ReLU(inplace)
    (23): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (24): Conv2d (512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (25): ReLU(inplace)
    (26): Conv2d (512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (27): ReLU(inplace)
    (28): Conv2d (512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (29): ReLU(inplace)
    (30): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
  )
  (classifier): Sequential(
    (0): Linear(in_features=25088, out_features=4096)
    (1): ReLU(inplace)
    (2): Dropout(p=0.5)
    (3): Linear(in_features=4096, out_features=4096)
    (4): ReLU(inplace)
    (5): Dropout(p=0.5)
    (6): Linear(in_features=4096, out_features=1000)
  )
)
In [93]:
labels = json.load(open('labels/imagenet_class_index.json'))
 
In [94]:
 
kitten_2 = normalize(kitten_1)
print(predict(kitten_2))
 
 
 
('287', u'lynx')
In [95]:
modulelist = list(vgg.features.modules())
 

Visualize Output Maps

In [96]:
def layer_outputs(image):
    """Run `image` through every module in the module-level `modulelist` (the
    VGG feature extractor) and plot a grayscale summary of each intermediate
    feature map, titled by layer type. Saves the grid to 'layer_outputs.jpg'.

    Args:
        image: preprocessed input batch, shape (1, 3, 224, 224).
    """
    outputs = []
    names = []
    for layer in modulelist[1:]:   # [0] is the Sequential wrapper itself
        image = layer(image)
        outputs.append(image)
        names.append(str(layer))

    output_im = []
    for feat in outputs:
        feat = feat.squeeze(0)                 # drop the batch dimension
        gray = to_grayscale(feat)              # collapse channels to one map
        output_im.append(gray.data.cpu().numpy())

    fig = plt.figure()
    plt.rcParams["figure.figsize"] = (30, 50)

    # Four columns, with enough rows for however many layers were traversed.
    # (The original hard-coded an 8x4 grid, which breaks past 32 layers.)
    cols = 4
    rows = (len(output_im) + cols - 1) // cols
    for i in range(len(output_im)):
        a = fig.add_subplot(rows, cols, i + 1)
        plt.imshow(output_im[i])
        plt.axis('off')
        a.set_title(names[i].partition('(')[0], fontsize=30)

    plt.savefig('layer_outputs.jpg', bbox_inches='tight')
 
 
In [103]:
layer_outputs(kitten_2)
 

Output of each filter separately at given layer

In [98]:
def filter_outputs(image, layer_to_visualize):
    """Plot each channel of the feature map produced at one layer of the
    module-level `modulelist` (VGG's 31-module feature extractor).

    Args:
        image: preprocessed input batch, shape (1, 3, 224, 224).
        layer_to_visualize: layer index; negative values count from the end
            (offset by 31, the number of feature-extractor modules).
    """
    if layer_to_visualize < 0:
        layer_to_visualize += 31
    output = None
    for count, layer in enumerate(modulelist[1:]):
        image = layer(image)
        if count == layer_to_visualize:
            output = image

    # (channels, h, w) feature map for the requested layer
    output = output.data.squeeze()
    filters = [output[i, :, :] for i in range(output.shape[0])]

    fig = plt.figure()
    plt.rcParams["figure.figsize"] = (10, 10)

    # Largest square grid that fits the channel count. Compute the side once
    # and as an int: add_subplot requires integers, and the original passed
    # np.sqrt(...) floats, which fails on current matplotlib.
    side = int(np.sqrt(len(filters)))
    for i in range(side * side):
        fig.add_subplot(side, side, i + 1)
        plt.imshow(filters[i])
        plt.axis('off')
 
In [99]:
filter_outputs(kitten_2, 0)
 
In [100]:
filter_outputs(kitten_2, -1)
 

Visualize weights [todo]

Class Specific Saliency Maps

In [101]:
# Standard ImageNet per-channel statistics (RGB order), matching what the
# pre-trained VGG expects.
normalise = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225]
    )
# Module-level preprocessing pipeline used by the saliency-map cells below:
# resize to VGG's 224x224 input, convert to a tensor, then normalize.
preprocess = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    normalise
    ])
 
In [105]:
def make_saliency_map(input, label):
    """Vanilla gradient saliency map: backpropagate the score of `label` to
    the input pixels and plot the positive gradient magnitudes (max over
    channels) next to the de-normalized original image.

    Relies on the module-level `preprocess`, `vgg`, `deprocess` and `USE_CUDA`.
    Returns the 2D numpy saliency map.
    """
    # wrap the preprocessed image with requires_grad so input.grad is populated
    if USE_CUDA:
        input = Variable(preprocess(input).unsqueeze(0).cuda(), requires_grad=True)
    else:
        input = Variable(preprocess(input).unsqueeze(0), requires_grad=True)
    output = vgg.forward(input)
    # backprop from the single scalar class score
    output[0][label].backward()
    # keep only positive gradients, then reorder (c, 224, 224) -> (224, 224, c) in place
    grads = input.grad.data.clamp(min=0)
    grads.squeeze_()
    grads.transpose_(0,1)
    grads.transpose_(1,2)
    # saliency value per pixel = max over the three channel gradients
    grads = np.amax(grads.cpu().numpy(), axis=2)

    # prepare the original image for display: (1, c, h, w) -> (h, w, c), un-normalized
    true_image = input.data
    true_image = true_image.squeeze()
    true_image = true_image.transpose(0,1)
    true_image = true_image.transpose(1,2)
    true_image = deprocess(true_image)

    fig = plt.figure()
    plt.rcParams["figure.figsize"] = (20, 20)


    a = fig.add_subplot(1,2,1)
    imgplot = plt.imshow(true_image)
    plt.title('Original Image')
    plt.axis('off')

    a = fig.add_subplot(1,2,2)
    imgplot = plt.imshow(grads)
    plt.axis('off')
    plt.title('Saliency Map')

    return grads
 
In [106]:
dog = load_image('images/Golden_retr.jpg')
dog_sal = make_saliency_map(dog, 207)
 
In [107]:
goldfish = load_image('images/goldfish.jpg')
goldfish_sal = make_saliency_map(goldfish, 1)
 

SmoothGrad

In [111]:
def smooth_grad(input, label, x=10, percent_noise=10):
    """SmoothGrad saliency map (Smilkov et al., 2017).

    The apparent noise one sees in a sensitivity map may be due to
    essentially meaningless local variations in partial derivatives; given
    typical training techniques there is no reason to expect derivatives to
    vary smoothly. SmoothGrad therefore averages the input gradients over `x`
    noisy copies of the image.

    Args:
        input: PIL image (anything `np.array` converts to (w, h, c) uint8-ish data).
        label: target class index whose score is differentiated.
        x: number of noisy samples to average over.
        percent_noise: noise stddev as a percentage of the input value range.

    Returns:
        2D numpy array: channel-wise max of the averaged positive gradients.

    Relies on the module-level `preprocess`, `normalize`, `deprocess`, `vgg`
    and `USE_CUDA`.
    """
    float_type = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
    tensor_input = torch.from_numpy(np.array(input)).type(float_type)  # (w, h, c)

    final_grad = torch.zeros((1, 3, 224, 224))
    if USE_CUDA:
        final_grad = final_grad.cuda()

    # According to the paper, noise level corresponds to stddev/(xmax-xmin),
    # hence stddev = percent_noise * (max - min) / 100. Using 100.0 keeps this
    # correct for integer arguments: the original `percent_noise/100` truncates
    # to 0 under integer division (Python 2, which produced the outputs below),
    # silently disabling the noise and degenerating SmoothGrad to a plain
    # saliency map. Hoisted out of the loop: tensor_input never changes.
    stddev = (percent_noise / 100.0) * (tensor_input.max() - tensor_input.min())

    for i in range(x):
        print('Sample:', i + 1)
        noise = torch.from_numpy(
            np.random.normal(loc=0, scale=stddev, size=tensor_input.shape)
        ).type(float_type)
        noisy = (tensor_input + noise).cpu().numpy()
        noisy = Image.fromarray(noisy.astype(np.uint8))
        if USE_CUDA:
            noisy = Variable(preprocess(noisy).unsqueeze(0).cuda(), requires_grad=True)
        else:
            noisy = Variable(preprocess(noisy).unsqueeze(0), requires_grad=True)

        output = vgg.forward(noisy)
        output[0][label].backward()      # fills noisy.grad with d(score)/d(pixel)
        final_grad += noisy.grad.data

    # average the accumulated gradients, keep positives,
    # reorder (1, c, 224, 224) -> (224, 224, c), then take the channel max
    grads = final_grad / x
    grads = grads.clamp(min=0)
    grads.squeeze_()
    grads.transpose_(0, 1)
    grads.transpose_(1, 2)
    grads = np.amax(grads.cpu().numpy(), axis=2)

    # original (noise-free) image, de-normalized for display
    true_image = normalize(input)
    true_image = true_image.squeeze()
    true_image = true_image.transpose(0, 1)
    true_image = true_image.transpose(1, 2)
    true_image = deprocess(true_image.data)

    fig = plt.figure()
    plt.rcParams["figure.figsize"] = (20, 20)

    a = fig.add_subplot(1, 2, 1)
    plt.imshow(true_image)
    plt.title('Original Image')
    plt.axis('off')

    a = fig.add_subplot(1, 2, 2)
    plt.imshow(grads)
    plt.axis('off')
    plt.title('SmoothGrad, Noise: ' + str(percent_noise) + '%, ' + 'Samples: ' + str(x))

    return grads
 
In [112]:
dog_sg = load_image('images/Golden_retr.jpg')
# label 207 = golden retriever; the earlier saliency cell uses 207 for this
# same image, so the `1` (goldfish) originally passed here was a copy-paste slip
dog_sal = make_saliency_map(dog_sg, 207)
dog_sg_sal = smooth_grad(dog, 207, 30, 10)
 
 
('Sample:', 1)
('Sample:', 2)
('Sample:', 3)
('Sample:', 4)
('Sample:', 5)
('Sample:', 6)
('Sample:', 7)
('Sample:', 8)
('Sample:', 9)
('Sample:', 10)
('Sample:', 11)
('Sample:', 12)
('Sample:', 13)
('Sample:', 14)
('Sample:', 15)
('Sample:', 16)
('Sample:', 17)
('Sample:', 18)
('Sample:', 19)
('Sample:', 20)
('Sample:', 21)
('Sample:', 22)
('Sample:', 23)
('Sample:', 24)
('Sample:', 25)
('Sample:', 26)
('Sample:', 27)
('Sample:', 28)
('Sample:', 29)
('Sample:', 30)
In [113]:
goldfish_sg = load_image('images/goldfish.jpg')
godlfish_sal = make_saliency_map(goldfish_sg, 1)
goldfish_sg_sal = smooth_grad(goldfish, 1, 30, 10)
 
 
 
('Sample:', 1)
('Sample:', 2)
('Sample:', 3)
('Sample:', 4)
('Sample:', 5)
('Sample:', 6)
('Sample:', 7)
('Sample:', 8)
('Sample:', 9)
('Sample:', 10)
('Sample:', 11)
('Sample:', 12)
('Sample:', 13)
('Sample:', 14)
('Sample:', 15)
('Sample:', 16)
('Sample:', 17)
('Sample:', 18)
('Sample:', 19)
('Sample:', 20)
('Sample:', 21)
('Sample:', 22)
('Sample:', 23)
('Sample:', 24)
('Sample:', 25)
('Sample:', 26)
('Sample:', 27)
('Sample:', 28)
('Sample:', 29)
('Sample:', 30)